Normalize reads and create DGE objects
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(edgeR)
Loading required package: limma
files <- dir("../input/Kallisto_output/", include.dirs = TRUE)
files %>% head()
[1] "1a1_q_002_S1_R1_001" "1a2_q_003_S2_R1_001" "1a3_q_004_S3_R1_001" "1a4_q_005_S4_R1_001"
[5] "1a5_q_006_S5_R1_001" "1a6_q_007_S6_R1_001"
counts.list <- map(files, ~ read_tsv(
file=file.path("..","input","Kallisto_output",.,"abundance.tsv"),
col_types = "cdddd"))
names(counts.list) <- files
counts <- sapply(counts.list, select, est_counts) %>%
bind_cols(counts.list[[1]][,"target_id"],.)
counts[is.na(counts)] <- 0
colnames(counts) <- sub(".est_counts","",colnames(counts),fixed = TRUE)
counts
write_csv(counts,"../output/2018-timecourse_V3.0_raw_counts_.csv.gz")
key <- readxl::read_excel("../input/tube_no_legend_time_course_2018.xlsx",
na=c("","na"),
col_types=c("text", "text", "text", "skip", "text", "skip", "skip", "skip", "text", "text", "text", "skip", "text", "skip", "skip", "date", "date")) %>%
mutate(sampling_time_specific=format(sampling_time_specific, format="%H:%M:%S"))
key
create reformatted tube_no
key <- key %>%
mutate(tube_no_2 = {
tolower(tube_no) %>%
str_replace("q_([1-9](_|$))", "q_00\\1") %>%
str_replace("q_([1-9][0-9](_|$))", "q_0\\1") }) %>%
select(tube_no, tube_no_2, everything())
key
samples <- tibble(
file=files,
tube_no_2 = str_extract(files, pattern = "q_[0-9]{3}(_d8)?")
)
samples
samples <- left_join(samples, key)
Joining with `by = join_by(tube_no_2)`
samples <- samples %>%
mutate(sampling_day=str_pad(sampling_day,width=2,side = "left",pad="0")) %>%
mutate(group=str_c(genotype, soil_trt, sampling_day, sampling_time,sep="-"))
samples
counts2 <- counts %>%
as.data.frame() %>%
column_to_rownames(var = "target_id") %>%
as.matrix() %>%
round(0)
samples2 <- samples %>%
select(-tube_no_2, -tube_no, -pot, -plant_no, -sampling_day_specific, -sampling_time_specific) %>%
as.data.frame() %>%
column_to_rownames(var="file")
dge <- DGEList(counts=counts2, samples=samples2, group=samples2$group)
dge <- calcNormFactors(dge)
save(dge, file="../output/timecourseDGE.Rdata")
load("../output/timecourseDGE.Rdata")
dge$sample
cpm averaged for each sample type
log2cpmGroup <- cpmByGroup(dge, log = TRUE)
dim(log2cpmGroup)
head(log2cpm[,1:10])
write.csv(log2cpmGroup, "../output/log2cpmGroup.csv.gz")
cpm for each individual sample
samplenames <- str_c(dge$samples$group,"_blk",dge$samples$block)
log2cpmSample <- cpmByGroup(dge, log = TRUE, group=samplenames)
dim(log2cpmSample)
head(log2cpmSample[,1:10])
log2cpmSample <- log2cpmSample %>% dplyr::rename("gene"=X1)
write.csv(log2cpmSample, "../output/log2cpmSample.csv.gz")
write_tsv(log2cpmSample, "../output/log2cpmSample.txt.gz") # .txt is compatible to GSEA app.